Teachers Credit Union Code Report

Table of Contents


Read in data & load packages

Data Proprocessing

Convert Data Type


tcu_cleaned <- tcu %>% 
  #select(-OriginationPurpose, -OrigBehavioralSegment, -OrigEngagementSegment) %>% 
  mutate(OriginationMonth = as.Date(OriginationMonth),
         ChangeEngagementSegment = as.factor(ChangeEngagementSegment),
         ChangeBehavioralSegment = as.factor(ChangeBehavioralSegment),
         Converted = ifelse(EngagementChangeMonth == '',0,1), #engineered our label (dependent variable)
         Converted = as.factor(Converted),
         BKHist = as.factor(BKHist),
         CollHist = as.factor(CollHist),
         CollHist2Yr = as.factor(CollHist2Yr),
         PublicHist2Yr = as.factor(PublicHist2Yr),
         Delq30Hist2Yr = as.factor(Delq30Hist2Yr),
         Delq60Hist2Yr = as.factor(Delq60Hist2Yr),
         Delq90Hist2Yr = as.factor(Delq90Hist2Yr),
         OrigHouseholdWasRetail = as.factor(as.character(OrigHouseholdWasRetail)),
         BKHist = as.factor(BKHist),
         CollHist = as.factor(CollHist),
         CollHist2Yr = as.factor(CollHist2Yr),
         PublicHist2Yr = as.factor(PublicHist2Yr))

tcu_cleaned <- tcu_cleaned %>% 
  mutate(EngagementChangeMonth = as.Date(EngagementChangeMonth, '%Y/%m/%d'),
         WelcomeLetter = as.Date(WelcomeLetter, '%Y/%m/%d'),
         PreapprovalLetter = as.Date(PreapprovalLetter, '%Y/%m/%d'),
         PreapprovalMessage = as.Date(PreapprovalMessage,'%Y/%m/%d'),
         PreapprovalConvo = as.Date(PreapprovalConvo, '%Y/%m/%d'),
         WelcomeMessage = as.Date(WelcomeMessage, '%Y/%m/%d'),
         WelcomeConvo = as.Date(WelcomeConvo, '%Y/%m/%d'))

Data Cleaning

Check NAs.


## Check NAs
sapply(tcu_cleaned, function(x) sum(is.na(x)))

        OriginationMonth       BaseXPMembershipID 
                       0                        0 
      OriginationPurpose    OrigBehavioralSegment 
                       0                        0 
   OrigEngagementSegment                  OrigZip 
                       0                        2 
        NbrIndirectLoans OrigBalanceIndirectLoans 
                       0                        0 
           OrigHousehold         AgeAtOrigination 
                       0                        4 
     DecisionCreditScore            MonthlyIncome 
                       0                        0 
     BorrowerCreditScore          BankruptcyTotal 
                   94082                    17505 
         CollectionTotal         OtherPublicTotal 
                   17505                    17505 
                  BKHist                 CollHist 
                   17505                    17505 
             CollHist2Yr            PublicHist2Yr 
                   17505                    17505 
           Delq30Hist2Yr            Delq60Hist2Yr 
                   17505                    17505 
           Delq90Hist2Yr             TrdDeptStore 
                   17505                    17505 
             TrdBankCard                  TrdBank 
                   17505                    17505 
                 TrdAuto                TrdOthFin 
                   17505                    17505 
                  TrdOil              TrdMortgage 
                   17505                    17505 
                TrdOther            Employer1Name 
                   17505                       22 
HousePaymentRentalAmount             OtherExpense 
                    9324                     9324 
              InitialLTV            WelcomeLetter 
                    9402                    96228 
       PreapprovalLetter       PreapprovalMessage 
                  138931                   183129 
        PreapprovalConvo           WelcomeMessage 
                  182852                   183245 
            WelcomeConvo    EngagementChangeMonth 
                  182713                   179354 
 ChangeEngagementSegment  ChangeBehavioralSegment 
                       0                        0 
  OrigHouseholdWasRetail             VehiclesUsed 
                       0                        4 
             VehiclesNew      WghtAvgContractRate 
                       4                        4 
                 AvgTerm    AvgDaysToFirstPayment 
                       4                        4 
        SumScheduledPmts     SumLoanPaymentTotals 
                       4                       10 
    DisabilityInsPolices       ExtendedWarranties 
                       4                        4 
              GapWaivers     JointLifeInsPolicies 
                       4                        4 
         LifeInsPolicies                   Rebook 
                       4                        4 
             VehicleMake             VehicleModel 
                       0                        0 
             VehicleYear                Converted 
                      58                        0 

Clean Age


## Remove Age Outliers
tcu_cleaned <- tcu_cleaned %>% 
  filter(AgeAtOrigination >= 18) %>% 
  filter(!is.na(AgeAtOrigination))

Clean Decision Credit Scores


credit_score_outliers <- tcu_cleaned %>% 
  filter(DecisionCreditScore < 300 | DecisionCreditScore > 850) %>% #The normal range of credit score in the US is from 300 to 850, we see there are around 2000+ customers' decision credit scores are out of the range
  mutate(DecisionCreditScore = ifelse(!is.na(BorrowerCreditScore), BorrowerCreditScore, DecisionCreditScore)) # so we decide to use their borrower credit score to replace decision credit score with borrower if exist


mean_credit_score <- tcu_cleaned %>% 
  filter(Converted == "1") %>% 
  filter(DecisionCreditScore >= 300 & DecisionCreditScore <= 850) %>% 
  summarize(mean_score = round(mean(DecisionCreditScore)))

unlist(mean_credit_score)

mean_score 
       729 

credit_score_outliers <- credit_score_outliers %>% #impute credit score for converted customers
  filter(!is.na(EngagementChangeMonth)) %>% ## if the decision credit score is less than 300, I think we can use the mean of the converted customers
  mutate(DecisionCreditScore = ifelse(DecisionCreditScore < 300, unlist(mean_credit_score), DecisionCreditScore), 
        DecisionCreditScore = ifelse(DecisionCreditScore > 850, 850, DecisionCreditScore)) ## concerns: we have a lot of 300 engaged customers, which is hard to believe that those low-credit loan customers were actually engaged

tcu_cleaned<- tcu_cleaned %>% 
  filter(DecisionCreditScore >= 300 & DecisionCreditScore <= 850)

## Use rbind to merge the two dfs
tcu_cleaned_new = rbind(tcu_cleaned, credit_score_outliers)

summary(tcu_cleaned_new$DecisionCreditScore)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  380.0   705.0   752.0   751.5   806.0   850.0 

Clean Monthly Income


tcu_cleaned_income1 <- tcu_cleaned_new %>% 
  filter(MonthlyIncome >= 2000)
#length(unique(tcu_cleaned_income1$OrigHousehold))

anti_join_tcu <- tcu_cleaned_income1%>% 
  filter(MonthlyIncome >= 20000 & MonthlyIncome < 50000)

tcu_cleaned_income_2 <- tcu_cleaned_income1 %>% 
  anti_join(anti_join_tcu, by = "BaseXPMembershipID")

tcu_cleaned_income_3 <- tcu_cleaned_income_2 %>% 
  mutate(MonthlyIncome = ifelse(MonthlyIncome >= 50000, MonthlyIncome/12, MonthlyIncome)) %>% 
  filter(MonthlyIncome <= 20000)

Convert data type 2


tcu_cleaned_4 <- tcu_cleaned_income_3 %>% 
  mutate(BankruptcyTotal = ifelse(is.na(BankruptcyTotal), 0, BankruptcyTotal),
         CollectionTotal = ifelse(is.na(CollectionTotal), 0, CollectionTotal),
         OtherPublicTotal = ifelse(is.na(OtherPublicTotal), 0, OtherPublicTotal),
         BKHist = ifelse(is.na(BKHist), FALSE, BKHist),
         CollHist = ifelse(is.na(CollHist), FALSE, CollHist),
         CollHist2Yr = ifelse(is.na(CollHist2Yr), FALSE, CollHist2Yr),
         PublicHist2Yr = ifelse(is.na(PublicHist2Yr), FALSE, PublicHist2Yr),
         Delq30Hist2Yr = ifelse(is.na(Delq30Hist2Yr), FALSE, Delq30Hist2Yr),
         Delq60Hist2Yr = ifelse(is.na(Delq60Hist2Yr), FALSE, Delq60Hist2Yr),
         Delq90Hist2Yr = ifelse(is.na(Delq90Hist2Yr), FALSE, Delq90Hist2Yr),
         TrdDeptStore = ifelse(is.na(TrdDeptStore), FALSE, TrdDeptStore),
         TrdBankCard = ifelse(is.na(TrdBankCard), 0, TrdBankCard),
         TrdBank = ifelse(is.na(TrdBank), 0, TrdBank),
         TrdAuto = ifelse(is.na(TrdAuto), 0, TrdAuto),
         TrdOthFin = ifelse(is.na(TrdOthFin), 0, TrdOthFin),
         TrdOil = ifelse(is.na(TrdOil), 0, TrdOil),
         TrdMortgage = ifelse(is.na(TrdMortgage), 0, TrdMortgage),
         TrdOther = ifelse(is.na(TrdOther), 0, TrdOther)) %>%
  filter(!is.na(HousePaymentRentalAmount)) %>% 
  mutate(total_expense = HousePaymentRentalAmount + OtherExpense) %>% 
  dplyr::select(-HousePaymentRentalAmount, -OtherExpense) %>% 
  filter(!is.na(WghtAvgContractRate)) %>% 
  filter(!is.na(OrigZip))

tcu_cleaned_4 <- tcu_cleaned_4 %>% 
  mutate(VehicleMake = toupper(VehicleMake))

Clean InitialLTV


test5 <- tcu_cleaned_4 %>% 
  filter(is.na(InitialLTV)) %>% 
  select(VehicleMake)

table(test5$VehicleMake)

           AUDI           BUICK       CHEVROLET CHEVROLET TRUCK 
              1               3               8               3 
       CHRYSLER           DODGE     DODGE TRUCK            FORD 
              1               4               4               8 
     FORD TRUCK     FORD TRUCKS             GMC      GMC TRUCKS 
              2               1               3               1 
          HONDA         HYUNDAI            JEEP             KIA 
              5               3               5               1 
          MAZDA         MERCURY          NISSAN         PONTIAC 
              1               3               1               2 
         SATURN          SUBARU          TOYOTA      VOLKSWAGEN 
              2               1               4               1 

unique(test5$VehicleMake)

 [1] "TOYOTA"          "CHEVROLET"       "HYUNDAI"        
 [4] "CHEVROLET TRUCK" "MERCURY"         "DODGE"          
 [7] "FORD"            "GMC TRUCKS"      "GMC"            
[10] "DODGE TRUCK"     "FORD TRUCK"      "JEEP"           
[13] "HONDA"           "BUICK"           "SUBARU"         
[16] "VOLKSWAGEN"      "PONTIAC"         "SATURN"         
[19] "MAZDA"           "AUDI"            "KIA"            
[22] "CHRYSLER"        "NISSAN"          "FORD TRUCKS"    

## Car 
car <- c("AUDI", "BUICK", "CHEVROLET", "CHRYSLER", "DODGE", "FORD", "GMC", "HONDA", "HYUNDAI", "JEEP", "KIA", "MAZDA", "MERCURY", "NISSAN",
         "PONTIAC", "SATURN", "SUBARU", "TOYOTA", "VOLKSWAGEN")
car <- data.frame(VehicleMake = car)

## fuzzy match?

library(stringdist)
library(fuzzyjoin)

fuzzy_join_vehicle <- stringdist_left_join(tcu_cleaned_4, car, by = "VehicleMake",
                     method = "jw", distance_col = "distance", max_dist = .2)

missing_ltv_result <- fuzzy_join_vehicle %>% 
  filter(!is.na(VehicleMake.y)) %>% 
  select(VehicleMake.x, VehicleMake.y, distance, InitialLTV) %>% 
  group_by(VehicleMake.y) %>% 
  summarize(mean_InitialLTV = mean(InitialLTV, na.rm = TRUE)) %>% 
  rename(VehicleMake = VehicleMake.y)
  
## remove the NA Initial lTV and then rbind

tcu_na_initial_ltv <- tcu_cleaned_4 %>% 
  filter(is.na(InitialLTV))

tcu_cleaned_5 <- tcu_cleaned_4 %>% 
  filter(!is.na(InitialLTV))

## Join the mean LTV to `tcu_na_initial_ltv`

final_impute_ltv <- stringdist_left_join(tcu_na_initial_ltv, missing_ltv_result, by = "VehicleMake",
                     method = "jw", distance_col = "distance", max_dist = .20)

final_impute_ltv_df <- final_impute_ltv %>% 
  #select(VehicleMake.x, VehicleMake.y, InitialLTV,mean_InitialLTV) %>% 
  mutate(mean_InitialLTV = ifelse(VehicleMake.x == "FORD TRUCKS" | VehicleMake.x == "FORD TRUCK", 194.72949, mean_InitialLTV)) %>% 
  mutate(mean_InitialLTV = ifelse(VehicleMake.x == "GMC TRUCKS", 82.97587,mean_InitialLTV)) %>% 
  ## replace the NA values with mean_InitialLTV
  mutate(InitialLTV = ifelse(is.na(InitialLTV), mean_InitialLTV, InitialLTV)) %>% 
  rename(VehicleMake = VehicleMake.x) %>% 
  dplyr::select(-VehicleMake.y, -mean_InitialLTV, -distance)

## get the mean LTV for those car models.
tcu_cleaned_6 <- rbind(tcu_cleaned_5, final_impute_ltv_df)
summary(tcu_cleaned_6$InitialLTV)

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
      0.0      74.8      90.2     176.4     103.0 3080200.0 

Convert data type3


tcu_cleaned_7 <- tcu_cleaned_6 %>%
  mutate(WL=ifelse(is.na(WelcomeLetter),0,1)) %>% 
  mutate(PL=ifelse(is.na(PreapprovalLetter),0,1)) %>%
  mutate(PM=ifelse(is.na(PreapprovalMessage),0,1)) %>%
  mutate(PC=ifelse(is.na(PreapprovalConvo),0,1)) %>%
  mutate(WM=ifelse(is.na(WelcomeMessage),0,1)) %>%
  mutate(WC=ifelse(is.na(WelcomeConvo),0,1)) %>% 
  mutate_at(c("WL", "PL", "PM","PC","WM",  "WC"), as.factor)

Get rid of unneeded columns


tcu_cleaned_7 <- tcu_cleaned_7 %>% 
  select(-BaseXPMembershipID, -OriginationPurpose, -OrigBehavioralSegment, -OrigEngagementSegment, -OrigHousehold) %>% 
  mutate(OrigZip = as.character(OrigZip))

tcu_cleaned_7 <- tcu_cleaned_7 %>% 
  dplyr::select(-WelcomeLetter, -PreapprovalLetter, -PreapprovalMessage, -PreapprovalConvo, -WelcomeMessage, -WelcomeConvo) %>% 
  dplyr::rename(WelcomeLetter = WL) %>% 
  dplyr::rename(PreapprovalLetter = PL) %>% 
  dplyr::rename(PreapprovalMessage = PM) %>% 
  dplyr::rename(PreapprovalConvo = PC) %>% 
  dplyr::rename(WelcomeMessage = WM) %>% 
  dplyr::rename(WelcomeConvo = WC)

Data Exploration

Continuous Variables

Decision Credit Score


library(patchwork)

decision_histogram <- ggplot(tcu_cleaned_7, aes(x = DecisionCreditScore, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() + 
  labs(title = "DecisionCreditScore") +
  theme_bw() +
  theme(legend.position = "none")

decision_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = DecisionCreditScore, fill = Converted)) +
  geom_boxplot() +
  theme_minimal() +
  theme_bw()

decision_histogram | decision_boxplot


#decision_histogram

Age


age_histogram <- ggplot(tcu_cleaned_7, aes(x = AgeAtOrigination, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "AgeAtOrigination") +
  theme_bw()+
  theme(legend.position = "none")

age_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = AgeAtOrigination, fill = Converted)) +
  geom_boxplot() +
  theme_minimal() +
  theme_bw()

age_histogram | age_boxplot

Monthly Income


monthly_income_histogram <- tcu_cleaned_7 %>% 
  #filter(MonthlyIncome < 60000) %>% 
  ggplot(aes(x = MonthlyIncome, fill = Converted)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "MonthlyIncome") +
  theme_bw() +
  theme(legend.position = "none")

monthly_income_boxplot <- tcu_cleaned_7 %>% 
  #filter(MonthlyIncome < 20000) %>% 
  ggplot(aes(x = Converted, y = MonthlyIncome, fill = Converted)) +
  geom_boxplot() +
  theme_minimal() +
  theme_bw()

monthly_income_histogram | monthly_income_boxplot

SumLoanPaymentsTotal


SumLoanPaymentTotals_histogram <- ggplot(tcu_cleaned_7, aes(x = SumLoanPaymentTotals, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  ggtitle("SumLoanPaymentsTotal")+
  theme_bw() +
  theme(legend.position = "none")

SumLoanPaymentTotals_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = SumLoanPaymentTotals, fill = Converted)) +
  geom_boxplot() +
  theme_minimal() +
  theme_bw()


SumLoanPaymentTotals_histogram | SumLoanPaymentTotals_boxplot

Total Expense


## other
totalexpense_histogram <- tcu_cleaned_7 %>% 
  filter(total_expense < 6000) %>% 
  ggplot(aes(x = total_expense, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "TotalExpense") +
  theme_bw() +
    theme(legend.position = "none")

totalexpense_boxplot <- tcu_cleaned_7 %>% 
  filter(total_expense < 6000) %>% 
  ggplot(aes(x = Converted, y = total_expense, fill = Converted)) +
  geom_boxplot() +
  theme_minimal() +
  theme_bw()

totalexpense_histogram | totalexpense_boxplot

TrdBankCard


TrdBankCard_histogram <- ggplot(tcu_cleaned_7, aes(x = TrdBankCard, fill = Converted)) +
  geom_histogram() +
  theme_minimal()

TrdBankCard_boxplot <- ggplot(tcu_cleaned_7, aes(x = Converted, y = TrdBankCard, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()

TrdBankCard_histogram | TrdBankCard_boxplot

AvgDaysToFirstPayment


AvgDaysToFirstPayment_histogram <- tcu_cleaned_7 %>% 
  filter(AvgDaysToFirstPayment <= 150) %>% 
  ggplot(aes(x = AvgDaysToFirstPayment, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "AvgDaysToFirstPayment") +
  theme(legend.position = "none")

AvgDaysToFirstPayment_boxplot <- tcu_cleaned_7 %>% 
  filter(AvgDaysToFirstPayment <= 150) %>% 
  ggplot(aes(x = Converted, y = AvgDaysToFirstPayment, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()

AvgDaysToFirstPayment_histogram | AvgDaysToFirstPayment_boxplot

AvgTerm


AvgTerm_histogram <- tcu_cleaned_7 %>% 
  ggplot(aes(x = AvgTerm, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "AvgTerm") +
  theme(legend.position = "none")

AvgTerm_boxplot <- tcu_cleaned_7 %>% 
  ggplot(aes(x = Converted, y = AvgTerm, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()

AvgTerm_histogram | AvgTerm_boxplot

WghtAvgContractRate


WghtAvgContractRate_histogram <- tcu_cleaned_7 %>% 
  ggplot(aes(x = WghtAvgContractRate, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "WghtAvgContractRate") +
  theme(legend.position = "none")

WghtAvgContractRate_boxplot <- tcu_cleaned_7 %>% 
  ggplot(aes(x = Converted, y = WghtAvgContractRate, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()

WghtAvgContractRate_histogram | WghtAvgContractRate_boxplot

InitialLTV


InitialLTV_histogram <- tcu_cleaned_7 %>% 
  filter(InitialLTV <= 200) %>% 
  ggplot(aes(x = InitialLTV, fill = Converted)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(title = "InitialLTV") +
  theme(legend.position = "none")

InitialLTV_boxplot <- tcu_cleaned_7 %>% 
  filter(InitialLTV <= 200) %>% 
  ggplot(aes(x = Converted, y = InitialLTV, fill = Converted)) +
  geom_boxplot() +
  theme_minimal()

InitialLTV_histogram | InitialLTV_boxplot

Correlation Matrix


library(ggcorrplot)
numericVars <- which(sapply(tcu_cleaned_7, FUN = is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names for use later on

cat("There are", length(numericVarNames), "numeric variables")

There are 40 numeric variables

categoricalVars <- which(sapply(tcu_cleaned_7, FUN = is.factor)) #index vector numeric variables
categoricalNames <- names(categoricalVars) #saving names for use later on
length(categoricalNames)

[1] 10

#categoricalNames
tcu_numericVars <- tcu_cleaned_7[, numericVars]
corr <- cor(tcu_numericVars, use = 'pairwise.complete.obs')
#corr[,'SumLoanPaymentTotals']
#corr[,'WghtAvgContractRate']
corr_df <- as.data.frame(corr)

new <- corr_df %>% 
  filter_all(any_vars (. >= 0.5))

ggcorrplot(corr, lab = FALSE)

Categorical Variable

Categorical variable overview.


tcu_cleaned_7 %>%
  dplyr::select(names(.)[23:56]) %>% 
  purrr::keep(is.numeric) %>%
  gather() %>%
  ggplot() +
  geom_histogram(mapping = aes(x=value,fill=key), color="black") +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none")


tcu_cleaned_7 %>%
  purrr::keep(is.factor) %>%
  gather() %>%
  group_by(key,value) %>% 
  summarise(n = n()) %>% 
  ggplot() +
  geom_bar(mapping=aes(x = value, y = n, fill=key), color="black", stat='identity') + 
  coord_flip() +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(x = "")


tcu_cleaned_8 <- tcu_cleaned_7 %>% 
  dplyr::select(-OriginationMonth, -BorrowerCreditScore, -Employer1Name, -EngagementChangeMonth, -ChangeEngagementSegment,
                -ChangeBehavioralSegment, -VehicleMake, -VehicleModel, -VehicleYear)

tcu_cleaned_8 <- tcu_cleaned_8 %>% 
  select(-OrigZip, -SumScheduledPmts, -OrigBalanceIndirectLoans, -WghtAvgContractRate)

tcu_cleaned_8 %>% 
  count(Converted) %>% 
  mutate(pct = prop.table(n)) %>% 
  ggplot(aes(x = Converted, y = pct, fill = Converted, label = scales::percent(pct))) +
  geom_col() +
  geom_text(vjust = -0.5) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

Chi-square test

Welcome letter


chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$WelcomeLetter)

    Pearson's Chi-squared test with Yates' continuity correction

data:  tcu_cleaned_8$Converted and tcu_cleaned_8$WelcomeLetter
X-squared = 963.6, df = 1, p-value < 2.2e-16

PreapprovalLetter


chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$PreapprovalLetter)

    Pearson's Chi-squared test with Yates' continuity correction

data:  tcu_cleaned_8$Converted and tcu_cleaned_8$PreapprovalLetter
X-squared = 173.37, df = 1, p-value < 2.2e-16

BKHist


chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$BKHist)

    Pearson's Chi-squared test

data:  tcu_cleaned_8$Converted and tcu_cleaned_8$BKHist
X-squared = 558.76, df = 2, p-value < 2.2e-16

CollHist


chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$CollHist)

    Pearson's Chi-squared test

data:  tcu_cleaned_8$Converted and tcu_cleaned_8$CollHist
X-squared = 799.71, df = 2, p-value < 2.2e-16

CollHist2Yr


chisq.test(tcu_cleaned_8$Converted,tcu_cleaned_8$CollHist2Yr)

    Pearson's Chi-squared test

data:  tcu_cleaned_8$Converted and tcu_cleaned_8$CollHist2Yr
X-squared = 512.43, df = 2, p-value < 2.2e-16

library(ggmosaic)
ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(WelcomeLetter), fill=Converted), na.rm=TRUE)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(x="WelcomeLetter")+
  labs(y="Converted")


ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(BKHist), fill=Converted), na.rm=TRUE)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(x="BKHist")+
  labs(y="Converted")


ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(CollHist), fill=Converted), na.rm=TRUE)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(x="CollHist")+
  labs(y="Converted")


ggplot(data = tcu_cleaned) +
  geom_mosaic(aes(x = product(CollHist2Yr), fill=Converted), na.rm=TRUE)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(x="CollHist2Yr")+
  labs(y="Converted")


ggplot(data = tcu_cleaned_8) +
  geom_mosaic(aes(x = product(PreapprovalLetter), fill=Converted), na.rm=TRUE)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(x="PreapprovalLetter")+
  labs(y="Converted")

Predictive Modeling

Data Parition

We use the undersampling method to handle the data imbalanced problem.


library(caret)
set.seed(7231)
tcu_cleaned_converted <- tcu_cleaned_8[tcu_cleaned_8$Converted==1, ]
tcu_cleaned_nonconverted <- tcu_cleaned_8[tcu_cleaned_8$Converted==0, ]

tcu_nonconvertedsample <- caret::createDataPartition(y=tcu_cleaned_nonconverted$Converted,p=0.05,list = FALSE)
tcu_nonconverted <- tcu_cleaned_nonconverted[tcu_nonconvertedsample,]


df3 <- rbind(tcu_nonconverted,tcu_cleaned_converted)
set.seed(1234)
rows <- sample(nrow(df3))
df3 <- df3[rows,]
round(prop.table(table(df3$Converted)),3)

    0     1 
0.488 0.512 

set.seed(1234)
sample.set <- df3 %>%
  pull(.) %>%
  caTools::sample.split(SplitRatio = .7)


tcuTrain3 <- subset(df3, sample.set == TRUE)
tcuTest3 <- subset(df3, sample.set == FALSE)
round(prop.table(table(tcuTrain3$Converted)),3)

    0     1 
0.488 0.512 

Logistic Regression


#df3
library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)
library(caTools)
library(rpart.plot)
logit.mod <-
  glm(Converted ~ ., family = binomial(link = 'logit'), data = tcuTrain3)
logit.pred.prob <- predict(logit.mod, tcuTest3, type = 'response')
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, "1", "0"))

test <- tcuTest3$Converted
pred <- logit.pred
prob <- logit.pred.prob

roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Converted Prediction Approaches", col = 2, lwd = 2)


accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- tibble(approach="Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
#comparisons

Decision Tree


set.seed(1234)
#grid search
ctrl <-
  trainControl(method = "cv",
               number = 10,
               selectionFunction = "oneSE")
grid <-
  expand.grid(
    .model = "tree",
    .trials = c(1, 5, 10, 15, 20, 25, 30, 35),
    .winnow = FALSE
  )
grid <-
  expand.grid(
    .cp = seq(from=0.0001, to=0.02, by=0.0001)
  )

set.seed(1234)
tree.mod3 <-
  train(
    Converted ~ .,
    data = tcuTrain3,
    method = "rpart",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid,
    na.action=na.omit
  )
#tree.mod3
library(rpart)
library(rpart.plot)
#get optimal cp

tree_modbest <-
  rpart(
    Converted ~ .,
    method = "class",
    data = tcuTrain3,
    control = rpart.control(cp = 0.0069)
  )
#save(tree_modbest,file = "/Volumes/GoogleDrive/Shared drives/ND Capstone - TCU/04-01-2020/tree_model.RData")

tree.pred <- predict(tree_modbest, tcuTest3, type = "class")
tree.pred.prob <- predict(tree_modbest, tcuTest3, type = "prob")

test <- tcuTest3$Converted
pred <- tree.pred
prob <- tree.pred.prob[,c("1")]


roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Classification Tree", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
#from the summary, the first split is welcome letter>=1
#comparisons

Random Forest


grid <- expand.grid(.mtry = c(3, 6, 9))
ctrl <-
  trainControl(method = "cv",
               number = 3,
               selectionFunction = "best")
set.seed(1234)
rf.mod <-
  train(
    Converted ~ .,
    data = tcuTrain3,
    method = "rf",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid,
    na.action=na.omit
  )

rf.pred <- predict(rf.mod, tcuTest3, type = "raw")
rf.pred.prob <- predict(rf.mod, tcuTest3, type = "prob")

test <- tcuTest3$Converted
pred <- rf.pred
prob <- rf.pred.prob[,c("1")]


roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Random Forest", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 
#comparisons

XGboost


ctrl <-
  trainControl(method = "cv",
               number = 3,
               selectionFunction = "best")

grid <- expand.grid(
  nrounds = 20,
  max_depth = c(4, 6, 8),
  eta =  c(0.1, 0.3, 0.5),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = c(0.5, 1)
)
set.seed(1234)
xgb.mod <-
  train(
    Converted ~ .,
    data = tcuTrain3,
    method = "xgbTree",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid,
    na.action=na.omit
  )


xgb.pred <- predict(xgb.mod, tcuTest3, type = "raw")
xgb.pred.prob <- predict(xgb.mod, tcuTest3, type = "prob")

test <- tcuTest3$Converted
pred <- xgb.pred
prob <- xgb.pred.prob[,c("1")]


roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
comparisons <- comparisons %>%
  add_row(approach="Extreme Gradient Boosting", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)

Model Comparison


comparisons

# A tibble: 4 x 5
  approach                  accuracy fmeasure kappa   auc
  <chr>                        <dbl>    <dbl> <dbl> <dbl>
1 Logistic Regression          0.633    0.636 0.266 0.686
2 Classification Tree          0.618    0.634 0.236 0.653
3 Random Forest                0.633    0.646 0.265 0.686
4 Extreme Gradient Boosting    0.639    0.654 0.278 0.695

The four models have similar performance. Therefore, we chose classification tree as it is easy to interpret and help us design business rules.

We use dt model to predict the probability of customers getting converted.


tree.pred4 <- predict(tree_modbest, tcu_cleaned_8, type = "prob")

prob <- tree.pred4[,c("1")]
tcu <- data.frame(tcu_cleaned_8,prob)

dt.pred.proball <- predict(tree_modbest, tcu_cleaned_8, type = "prob")
prob <- dt.pred.proball[,c("1")]
tcu <- data.frame(tcu_cleaned_8,prob)


final <- merge(tcu,tcu_cleaned,by.x =c("InitialLTV","SumLoanPaymentTotals","DecisionCreditScore","MonthlyIncome"),
               by.y = c("InitialLTV","SumLoanPaymentTotals","DecisionCreditScore","MonthlyIncome"),all.y = TRUE
)


final <- final[!is.na(final$prob),]

tcu_list <- final %>% 
  filter(Converted.x == 0) %>% 
  dplyr::select(BaseXPMembershipID,prob)

write to csv


#write.csv(tcu_list,"tcu_list")

K-Means Clustering

Based on Converted Customers.


tcu_k_means <- tcu_cleaned_7 %>% 
  select(DecisionCreditScore,WghtAvgContractRate,AvgDaysToFirstPayment,InitialLTV,MonthlyIncome,
         AgeAtOrigination,OrigBalanceIndirectLoans,SumScheduledPmts,SumLoanPaymentTotals,total_expense,
         AvgTerm,TrdBankCard,WelcomeLetter,TrdOil,TrdAuto,TrdOther,
         TrdOthFin,TrdDeptStore,ExtendedWarranties,TrdBank,TrdMortgage,Converted)

tcu_k_means_converted <- tcu_k_means %>% 
  select(-WelcomeLetter) %>% 
  filter(Converted == 1) %>% 
  na.omit()

tcu_k_means_converted = scale(tcu_k_means_converted %>% select(-Converted))

summary(tcu_k_means_converted) 

 DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
 Min.   :-3.9690     Min.   :-2.0858     Min.   :-1.7439      
 1st Qu.:-0.7649     1st Qu.:-0.6923     1st Qu.:-1.3477      
 Median :-0.1274     Median :-0.1827     Median : 0.5014      
 Mean   : 0.0000     Mean   : 0.0000     Mean   : 0.0000      
 3rd Qu.: 0.8288     3rd Qu.: 0.4660     3rd Qu.: 0.5014      
 Max.   : 2.0198     Max.   : 6.9528     Max.   : 6.7090      
   InitialLTV       MonthlyIncome     AgeAtOrigination  
 Min.   :-0.08868   Min.   :-1.0765   Min.   :-1.99981  
 1st Qu.:-0.02054   1st Qu.:-0.6701   1st Qu.:-0.80289  
 Median :-0.00863   Median :-0.2637   Median : 0.04198  
 Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
 3rd Qu.: 0.00041   3rd Qu.: 0.3459   3rd Qu.: 0.74605  
 Max.   :90.18255   Max.   : 6.1577   Max.   : 3.28069  
 OrigBalanceIndirectLoans SumScheduledPmts  SumLoanPaymentTotals
 Min.   :-2.1235          Min.   :-2.6509   Min.   :-2.1012     
 1st Qu.:-0.7035          1st Qu.:-0.7123   1st Qu.:-0.7050     
 Median :-0.1506          Median :-0.1676   Median :-0.1444     
 Mean   : 0.0000          Mean   : 0.0000   Mean   : 0.0000     
 3rd Qu.: 0.5645          3rd Qu.: 0.5386   3rd Qu.: 0.5514     
 Max.   : 5.6967          Max.   : 5.8294   Max.   : 5.9063     
 total_expense         AvgTerm         TrdBankCard     
 Min.   :-0.60141   Min.   :-5.5382   Min.   :-1.1116  
 1st Qu.:-0.18587   1st Qu.:-0.7541   1st Qu.:-0.6650  
 Median :-0.09353   Median : 0.4419   Median :-0.2184  
 Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 0.19272   3rd Qu.: 0.7409   3rd Qu.: 0.4515  
 Max.   :61.00133   Max.   : 1.6379   Max.   :10.2763  
     TrdOil           TrdAuto           TrdOther      
 Min.   :-0.3262   Min.   :-0.9910   Min.   :-0.7544  
 1st Qu.:-0.3262   1st Qu.:-0.5665   1st Qu.:-0.7544  
 Median :-0.3262   Median :-0.1419   Median :-0.1836  
 Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.:-0.3262   3rd Qu.: 0.2826   3rd Qu.: 0.3871  
 Max.   :14.5016   Max.   : 7.5002   Max.   :15.5116  
   TrdOthFin        TrdDeptStore     ExtendedWarranties
 Min.   :-0.7163   Min.   :-0.8099   Min.   :-1.097    
 1st Qu.:-0.7163   1st Qu.:-0.8099   1st Qu.:-1.097    
 Median :-0.4471   Median :-0.3280   Median : 0.898    
 Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.000    
 3rd Qu.: 0.3605   3rd Qu.: 0.4751   3rd Qu.: 0.898    
 Max.   : 8.1670   Max.   : 6.8998   Max.   : 2.893    
    TrdBank         TrdMortgage     
 Min.   :-0.5653   Min.   :-0.8990  
 1st Qu.:-0.5653   1st Qu.:-0.8990  
 Median :-0.5653   Median :-0.4037  
 Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.: 0.4304   3rd Qu.: 0.5867  
 Max.   :16.3617   Max.   : 8.0154  

ncol(tcu_k_means_converted)

[1] 20

We got 20 variables. As a rule of thumb, we use sqrt(n/2) as a starting point. In this case, it would be 3.


set.seed(1234)
k_3 <- kmeans(tcu_k_means_converted, centers=3, nstart = 25)
k_3$size

[1] 1991 2768 3381

k_3$centers

  DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1          -0.0979945         -0.08322341            0.09794321
2           0.6235013         -0.46268783           -0.09351241
3          -0.4527491          0.42780766            0.01888123
    InitialLTV MonthlyIncome AgeAtOrigination
1 -0.006225711    0.52267366       0.03537167
2 -0.018844764    0.06726442       0.47981243
3  0.019094261   -0.36286044      -0.41364857
  OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1                1.3323512        1.2841754            1.3166601
2               -0.5505496       -0.4892673           -0.6108832
3               -0.3338628       -0.3556644           -0.2752279
  total_expense    AvgTerm TrdBankCard      TrdOil    TrdAuto
1   0.181580591  0.4814631   0.0794628 -0.01750233  0.4432106
2   0.004420715 -0.5561747   0.6135295  0.28006800  0.1505790
3  -0.110548209  0.1718126  -0.5490861 -0.21898287 -0.3842754
    TrdOther   TrdOthFin TrdDeptStore ExtendedWarranties    TrdBank
1  0.1556281  0.16107022   0.03104751         0.20862369  0.1040744
2  0.1263763 -0.04640037   0.39475870        -0.27033360  0.2800284
3 -0.1951095 -0.05686323  -0.34146929         0.09846603 -0.2905444
  TrdMortgage
1   0.1437406
2   0.5668903
3  -0.5487548

library(factoextra)
fviz_cluster(k_3, data = tcu_k_means_converted)


# Let's see how varying the number of clusters affects the results.
k_4 <- kmeans(tcu_k_means_converted, centers = 4, nstart = 25)
k_5 <- kmeans(tcu_k_means_converted, centers = 5, nstart = 25)
k_6 <- kmeans(tcu_k_means_converted, centers = 6, nstart = 25)

# Plot and compare the results.
p1 <- fviz_cluster(k_3, geom = "point",  data = tcu_k_means_converted) + ggtitle("k = 3")
p2 <- fviz_cluster(k_4, geom = "point",  data = tcu_k_means_converted) + ggtitle("k = 4")
p3 <- fviz_cluster(k_5, geom = "point",  data = tcu_k_means_converted) + ggtitle("k = 5")
p4 <- fviz_cluster(k_6, geom = "point",  data = tcu_k_means_converted) + ggtitle("k = 6")

library(gridExtra) 
grid.arrange(p1, p2, p3, p4, nrow = 2)


wcss <- vector()

# ... then specify the loop that generates the values.
n = 20
set.seed(1234)
for(k in 1:n) {
  wcss[k] <- sum(kmeans(tcu_k_means_converted, k)$withinss)
}

wcss

 [1] 162780.00 145691.12 134792.38 129051.95 124750.56 121314.04
 [7] 118522.71 114970.77 109303.47 105363.97 107395.73 105200.90
[13] 103700.98  96419.28 100129.89  99783.08  98031.89  96494.71
[19]  95581.65  94746.20

# Visualize the values of WCSS as they relate to number of clusters
tibble(value = wcss) %>%
  ggplot(mapping=aes(x=seq(1,length(wcss)), y=value)) +
  geom_point()+
  geom_line() +
  labs(title = "The Elbow Method", y = "WCSS", x = "Number of Clusters (k)" ) +
  theme_minimal() 


# According to the elbow method, we should choose 10 clusters.
k_10 <- kmeans(tcu_k_means_converted, centers = 10, nstart = 25, iter.max = 30)
k_10$centers

   DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1          -0.59707944          0.31124978           0.178117369
2          -0.27756955          0.07508951          -0.023471386
3           0.33628211         -0.42660130           0.032571571
4           0.67436393         -0.36423994          -0.069363917
5           0.06598318         -0.27272917           0.114950170
6          -0.28196961          0.02423280           0.070295606
7          -1.44405543          2.28782095          -0.005310404
8          -0.22522763         -0.02014959           0.011096255
9           0.84421759         -0.57540761          -0.246093767
10         -1.44147835          1.53169822           0.501401287
     InitialLTV MonthlyIncome AgeAtOrigination
1  -0.002900525   -0.03442677       -0.2032114
2  -0.012262867   -0.08069914        0.6093515
3  -0.013714418    0.53822037        0.2519443
4  -0.023989595   -0.01412350        0.3044887
5  -0.007203508    1.08977144        0.1277190
6  -0.006602262   -0.18383600       -0.6084900
7  -0.007987302   -0.45451545       -0.3308641
8  -0.009001840   -0.40167804       -0.7831764
9  -0.014084744   -0.21702592        0.8222129
10 30.046975526    0.60278898        0.2297352
   OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1                0.75394612       0.66772688            0.8325553
2               -0.24551118      -0.24477049           -0.2337145
3               -0.05837050      -0.08093904           -0.1210659
4               -1.05993068      -0.79791938           -1.1086281
5                2.07219818       2.07587422            1.9964984
6               -0.07892526      -0.17239726           -0.0617731
7               -0.62656811      -0.35058122           -0.3800873
8               -0.49454637      -0.58626698           -0.4856928
9               -0.04019152      -0.13571835           -0.1215228
10               0.46531718       0.85797657            0.8009348
   total_expense     AvgTerm  TrdBankCard       TrdOil     TrdAuto
1    -0.01621926  0.66621442 -0.410394336 -0.203023108 -0.01366398
2    -0.03030366 -0.06067671  0.751604183  3.940916602  0.28677614
3     0.15665460 -0.04765249  1.313113192  0.008754065  0.76079949
4    -0.03357277 -1.50088056  0.104099619 -0.112779395 -0.09261898
5     0.23335347  0.38140072  0.123172159 -0.071895240  0.59690223
6    -0.04817686  0.33929628 -0.130826385 -0.145628003 -0.03062752
7    -0.11559744 -0.19496418 -0.551598333 -0.237147326 -0.20615814
8    -0.11889751  0.19072431 -0.569406057 -0.250354665 -0.49006615
9    -0.07966413  0.13338369 -0.008809614 -0.174689395 -0.24405517
10   36.86208019  0.30897678 -0.292839143  0.123095733 -0.28344055
     TrdOther   TrdOthFin TrdDeptStore ExtendedWarranties     TrdBank
1  -0.2229764 -0.15223557  -0.25474512         0.36024460 -0.23482542
2   0.1304586  0.17608253   0.75875800         0.09481819  0.82640979
3   0.3873875  0.09291122   0.89242092        -0.20263670  0.73919797
4  -0.1770579 -0.23135844  -0.10759834        -0.77347978  0.01137375
5   0.1021508  0.07020825  -0.06530054         0.08436376  0.13052389
6   1.7092130  2.27355955   0.13948428         0.05738641 -0.08969494
7  -0.3032368 -0.21205767  -0.35009262         0.14163722 -0.20166435
8  -0.3242320 -0.28114299  -0.37099803        -0.08079298 -0.29580149
9  -0.2094304 -0.31797884   0.01594123         0.36319081 -0.16374347
10 -0.2787531  0.53995272  -0.80988084         0.23300717 -0.23341143
   TrdMortgage
1  -0.40719310
2   0.43560386
3   1.16765429
4   0.25880458
5   0.29779714
6  -0.31238464
7  -0.47436688
8  -0.60300887
9   0.03543928
10 -0.56882237

fviz_cluster(k_10, geom = "point",  data = tcu_k_means_converted) + ggtitle("k = 10")


converted_customers <- tcu_k_means %>% filter(Converted == 1) %>% select(-WelcomeLetter)

converted_customers$cluster <- k_10$cluster

converted_customers %>%
  group_by(cluster) %>%
  summarize(DecisionCreditScore = mean(DecisionCreditScore),
            WghtAvgContractRate = mean(WghtAvgContractRate),
            AvgDaysToFirstPayment = mean(AvgDaysToFirstPayment),
            InitialLTV = mean(InitialLTV),
            MonthlyIncome = mean(MonthlyIncome),
            AgeAtOrigination = mean(AgeAtOrigination),
            OrigBalanceIndirectLoans = mean(OrigBalanceIndirectLoans),
            SumScheduledPmts = mean(SumScheduledPmts),
            SumLoanPaymentTotals = mean(SumLoanPaymentTotals),
            total_expense = mean(total_expense),
            AvgTerm = mean(AvgTerm)) %>% 
  as.data.frame()

   cluster DecisionCreditScore WghtAvgContractRate
1        1            694.0023            5.665988
2        2            713.0487            5.156299
3        3            749.6412            4.073533
4        4            769.7946            4.208123
5        5            733.5283            4.405625
6        6            712.7864            5.046538
7        7            643.5130            9.931889
8        8            716.1689            4.950751
9        9            779.9198            3.752374
10      10            643.6667            8.300000
   AvgDaysToFirstPayment  InitialLTV MonthlyIncome AgeAtOrigination
1               42.55229   100.44109      4564.117         43.51743
2               41.02597    89.47858      4450.263         55.05844
3               41.45030    87.77893      5973.128         49.98211
4               40.67850    75.74757      4614.074         50.72841
5               42.07402    95.40266      7330.231         48.21771
6               41.73592    96.10667      4196.493         37.76117
7               41.16348    94.48490      3530.480         41.70435
8               41.28770    93.29697      3660.488         35.28006
9               39.34041    87.34531      4114.828         58.08176
10              45.00000 35286.31304      6132.000         49.66667
   OrigBalanceIndirectLoans SumScheduledPmts SumLoanPaymentTotals
1                  28655.49         460.3555             34110.02
2                  19501.04         333.7753             22615.48
3                  21215.14         356.5017             23829.85
4                  12041.43         257.0433             13183.79
5                  40729.91         655.6916             46657.49
6                  21026.87         343.8148             24469.03
7                  16010.78         319.0974             21037.56
8                  17220.03         286.4034             19899.12
9                  21381.65         348.9028             23824.92
10                 26011.82         486.7467             33769.15
   total_expense  AvgTerm
1       633.7266 74.25097
2       618.4740 66.95779
3       820.9384 67.08847
4       614.9338 52.50768
5       903.9985 71.39332
6       599.1184 70.97087
7       526.1061 65.61043
8       522.5323 69.48019
9       565.0197 68.90487
10    40570.6667 70.66667

cluster_rows = converted_customers %>% 
  group_by(cluster) %>% 
  summarise(n = n()) 

cluster_rows %>% 
  mutate(cluster = as.factor(cluster),
         percent = n/sum(n)) 

# A tibble: 10 x 3
   cluster     n  percent
   <fct>   <int>    <dbl>
 1 1        1291 0.159   
 2 2         308 0.0378  
 3 3        1006 0.124   
 4 4        1042 0.128   
 5 5         689 0.0846  
 6 6         515 0.0633  
 7 7         575 0.0706  
 8 8        1439 0.177   
 9 9        1272 0.156   
10 10          3 0.000369

head(tcu_k_means)

  DecisionCreditScore WghtAvgContractRate AvgDaysToFirstPayment
1                 762                4.90                    45
2                 716                5.84                    46
3                 781                4.40                    45
4                 790                4.50                    45
5                 751                3.90                    45
6                 700                6.99                    44
  InitialLTV MonthlyIncome AgeAtOrigination OrigBalanceIndirectLoans
1   77.84608          4166               25                 15499.60
2  100.40390          4892               76                 18726.98
3  116.05114          5416               31                 19639.35
4   57.89504          4583               50                 11373.69
5   69.14370          3061               23                 12112.18
6   83.85382          4128               47                 11625.32
  SumScheduledPmts SumLoanPaymentTotals total_expense AvgTerm
1           292.37             17542.20             0      60
2           299.29             22446.75           672      75
3           365.85             21951.00          1200      60
4           212.41             12744.60          1145      60
5           222.85             13371.00           648      60
6           198.72             14307.84           912      72
  TrdBankCard WelcomeLetter TrdOil TrdAuto TrdOther TrdOthFin
1           5             0      0       1        5         5
2           1             0      0       3        2        21
3          15             0      0       6        1         5
4           5             0      1       1        0         1
5          13             0      2       1        4         6
6           9             0      1       3        1         4
  TrdDeptStore ExtendedWarranties TrdBank TrdMortgage Converted
1            3                  0       1           1         0
2            0                  0       1           5         0
3            4                  0       1           2         0
4            2                  1       1           8         0
5            1                  0       3           4         0
6            6                  0       9           5         0

Discussion: The most valuable customer clusters are cluster 3 and 5 for the following reasons.